{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "该notebook对文字数据进行预处理\n",
    "\n",
    "- 处理所有text信息，给出最佳的匹配方式以及最小编辑距离\n",
    "- 对所有轮廓信息进行归一化操作，得到每个text的中点、面积信息\n",
    "- 保留point_seq_id/image_id信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2021-11-22 01:34:43.767902: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import numpy as np\n",
    "from tqdm import tqdm\n",
    "from utils import get_area, defective_match, is_correct_text, get_bounding_box"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"./data/train_label_public.json\", \"r\", encoding=\"utf-8\") as f:\n",
    "    train_data = json.load(f)[\"data\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normlize_contour(msg: dict):\n",
    "    # 将每个OCR框的坐标点归一化\n",
    "    origin_contours = [x[\"contour\"] for x in msg[\"texts\"]]\n",
    "    board_contour = np.array(msg[\"board_contour\"])\n",
    "    board_top_left = np.min(board_contour, axis=0).tolist()\n",
    "    board_bottom_right = np.max(board_contour, axis=0).tolist()\n",
    "    width = board_bottom_right[0] - board_top_left[0]\n",
    "    height = board_bottom_right[1] - board_top_left[1]\n",
    "    # 获取bbox坐标，便于索引图像块\n",
    "    origin_bboxs = []\n",
    "    normlize_bboxs = []\n",
    "    for contour in origin_contours:\n",
    "        top_left, bottom_right = get_bounding_box(contour)\n",
    "        bbox = [\n",
    "            max(top_left[0] - board_top_left[0], 0),\n",
    "            max(top_left[1] - board_top_left[1], 0),\n",
    "            min(bottom_right[0] - board_top_left[0], board_bottom_right[0]),\n",
    "            min(bottom_right[1] - board_top_left[1], board_bottom_right[1])\n",
    "        ]\n",
    "        origin_bboxs.append(top_left + bottom_right)\n",
    "        # 按照最长的长度归一化bbox\n",
    "        l = max(width, height)\n",
    "        normlize_bboxs.append([x / l for x in bbox])\n",
    "    return board_top_left, board_bottom_right, origin_bboxs, normlize_bboxs\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_new_msg(msg: dict):\n",
    "    # 计算每个多边形面积并归一化\n",
    "    total_areas = get_area(msg[\"board_contour\"])\n",
    "    areas = [get_area(x[\"contour\"]) for x in msg[\"texts\"]]\n",
    "    areas = [x / total_areas for x in areas]\n",
    "    # 获取text信息\n",
    "    texts = [x[\"text\"] for x in msg[\"texts\"]]\n",
    "\n",
    "    # 获取每个OCR的bbox并归一化\n",
    "    board_top_left, board_bottom_right, origin_bboxs, normlize_bboxs = normlize_contour(msg)\n",
    "    \n",
    "    # 获取文字信息\n",
    "    min_dist, rankorder = defective_match(msg)\n",
    "    return {\n",
    "        \"image_id\": msg[\"image_id\"],\n",
    "        \"point_seq_id\": msg[\"point_seq_id\"],\n",
    "        \"name\": msg[\"name\"],\n",
    "        \"texts\": texts,\n",
    "        \"areas\": areas,\n",
    "        \"min_dist\": min_dist,\n",
    "        \"rankorder\": rankorder,\n",
    "        \"board_bbox\": [board_top_left[0], board_top_left[1], board_bottom_right[0], board_bottom_right[1]],\n",
    "        \"origin_bboxs\": origin_bboxs,\n",
    "        \"normlize_bboxs\": normlize_bboxs,\n",
    "        \"contour\": [x[\"contour\"] for x in msg[\"texts\"]],\n",
    "        \"board_contour\": msg[\"board_contour\"]\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'image_id': '0022_92293827523_77f214c545bcef6e56d1e471084fcf69', 'point_seq_id': 'a795e456-b10c-11eb-8e60-0242ac1e1a04', 'name': '副食粮油', 'texts': ['副食粮油', '名烟名酒百货石磨香油'], 'areas': [0.6800398200006474, 0.13639159571368448], 'min_dist': 0, 'rankorder': [0, -1], 'board_bbox': [346, 926, 718, 1167], 'origin_bboxs': [[356, 941, 699, 1121], [376, 1084, 680, 1159]], 'normlize_bboxs': [[0.026881720430107527, 0.04032258064516129, 0.9489247311827957, 0.5241935483870968], [0.08064516129032258, 0.42473118279569894, 0.8978494623655914, 0.6263440860215054]], 'contour': [[[685, 941], [699, 944], [699, 1050], [682, 1055], [618, 1073], [595, 1079], [563, 1087], [508, 1100], [477, 1107], [412, 1121], [368, 1121], [356, 1093], [356, 1004], [455, 984], [659, 944], [677, 941]], [[680, 1084], [680, 1110], [448, 1150], [395, 1159], [376, 1159], [376, 1135], [393, 1132], [586, 1098], [651, 1087], [664, 1085], [673, 1084]]], 'board_contour': [[364, 1167], [718, 1101], [690, 926], [346, 997]]}\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "key = random.choice(list(train_data.keys()))\n",
    "new_msg = get_new_msg(train_data[key])\n",
    "print(new_msg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 21034/21034 [00:05<00:00, 4140.22it/s]\n"
     ]
    }
   ],
   "source": [
    "new_train_data = []\n",
    "for key in tqdm(train_data):\n",
    "    msg = get_new_msg(train_data[key])\n",
    "    new_train_data.append(msg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "可以正确拼接为name的数据量为17496\n",
      "训练数据总量为21034\n",
      "正确数据占比: 0.8317961395835314\n"
     ]
    }
   ],
   "source": [
    "correct_num = 0\n",
    "for d in new_train_data:\n",
    "    if d[\"min_dist\"] == 0:\n",
    "        correct_num += 1\n",
    "print(f\"可以正确拼接为name的数据量为{correct_num}\")\n",
    "print(f\"训练数据总量为{len(new_train_data)}\")\n",
    "print(f\"正确数据占比: {correct_num / len(new_train_data)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "board_bbox_l_stat = []\n",
    "bbox_l_stat = []\n",
    "def get_l_and_stat(bbox, stat: list):\n",
    "    l1, l2 = bbox[2] - bbox[0], bbox[3] - bbox[1]\n",
    "    stat += [l1, l2]\n",
    "for d in new_train_data:\n",
    "    get_l_and_stat(d[\"board_bbox\"], board_bbox_l_stat)\n",
    "    for bbox in d[\"origin_bboxs\"]:\n",
    "        get_l_and_stat(bbox, bbox_l_stat)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:ylabel='Count'>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAD4CAYAAAAdIcpQAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAaeUlEQVR4nO3de5Bc5Xnn8e/PUsCAJEBoYIcRMEIrKAG1K5sJS+LgIiGJMfEacBIiymvYDRuBFzZm7U0BoSqmQqnKToKdOCFyhK3CJBgsBxMpAWxj1oHaKjAeYXEVymi4jiRLY1gvonBpM+LZP/ptcdR095kZdZ/Tl9+nqkun3z6n+9HpyzPnvSoiMDMza+Y9ZQdgZmadz8nCzMxyOVmYmVkuJwszM8vlZGFmZrnmlh1AuyxatCiGh4fLDsPMrKts2rTpJxExUFves8lieHiY0dHRssMwM+sqkl6uV+5qKDMzy+VkYWZmuZwszMwsl5OFmZnlcrIwM7NcThZmZpbLycLMzHI5WZiZWa6eHZTXTaamphgbG9t/f9myZcyd67fGzDqHf5E6wNjYGFfeeh/zBoZ4c3I7f3P1b7B8+fKywzIz28/JoiTZq4nx8XGOWHQ8CwaHyw3KzKyBtiULSeuAjwC7I+KMVPYN4NS0y1HATyNihaRhYAuwNT32WERclY45E7gdOAy4H/hU9MBasNmrid1bn2D+Scs5suygzMwaaOeVxe3AXwF3VAsi4neq25JuAf5vZv/xiFhR53nWAKuAx6gki/OBB1ofbnvVtktkrybenNxeYmRmZvnaliwi4pF0xfAukgRcAvxKs+eQNAgsiIhH0/07gIvowmSRvZIAfDVhZl2lrK6z5wC7ImIsU7ZE0o8kPSzpnFQ2BExk9plIZXVJWiVpVNLo5ORk66M+SPMGhlgwOMyCwWEOX3hc2eGYmU1bWQ3clwJ3Ze7vBE6MiNdSG8U/SDodUJ1jG7ZXRMRaYC3AyMhI6e0atY3YPdDUYmZ9qvBkIWku8DHgzGpZROwF9qbtTZLGgVOoXEkszhy+GNhRXLQHx43YZtYryqiG+lXg+YjYX70kaUDSnLR9MrAMeCEidgJ7JJ2d2jkuAzaUEPOsVaueXO1kZt2sbclC0l3Ao8CpkiYkXZEeWsmBVVAAHwSekvQk8PfAVRHxenrsk8BXgG3AOF3YuG1m1u3a2Rvq0gbl/7lO2T3APQ32HwXOaGlwZmY2I55I0MzMcjlZmJlZLicLMzPL5WRhZma5nCzMzCyXpyhvsYMdtf32vn2Mj4/vv++FkMysE/hXqMUOdtT2W6//mJs3vMwxi/d4ISQz6xhOFm1QHbU926nHD1806IWQzKyjuM3CzMxyOVmYmVkuJwszM8vlZGFmZrmcLMzMLJd7Q3Uwj7kws07hX54O5jEXZtYpnCxaoJ1rbXvMhZl1AieLFihire3aKilwtZSZFce/NC1ysKO282SrpABXS5lZoZwsuoirpMysLG3rOitpnaTdkp7JlN0kabukzel2QeaxGyRtk7RV0ocy5WdKejo99iVJalfMZmZWXzvHWdwOnF+n/IsRsSLd7geQdBqwEjg9HfPXkuak/dcAq4Bl6VbvOc3MrI3aliwi4hHg9WnufiFwd0TsjYgXgW3AWZIGgQUR8WhUuhjdAVzUloDNzKyhMkZwXyPpqVRNdXQqGwJezewzkcqG0nZtuZmZFajoZLEGWAqsAHYCt6Tyeu0Q0aS8LkmrJI1KGp2cnDzIUM3MrKrQZBERuyJiX0S8DdwGnJUemgBOyOy6GNiRyhfXKW/0/GsjYiQiRgYGBlobvJlZHys0WaQ2iKqLgWpPqY3ASkmHSlpCpSH78YjYCeyRdHbqBXUZsKHImM3MrI3jLCTdBZwLLJI0AXwWOFfSCipVSS8BVwJExLOS1gPPAVPA1RGxLz3VJ6n0rDoMeCDdzMysQG1LFhFxaZ3irzbZfzWwuk75KHBGC0MzM7MZ8gjuWWrn5IFmZp3GyWKWipg80MysU3ilvINQnTzw8IXHlR2KmVlbOVmYmVkuJwszM8vlZGFmZrmcLMzMLJd7Q/WAbDde8HKrZtZ6/kXpAdluvF5u1czawcmiR1S78ZqZtYOTRY95e98+xsfH9993lZSZtYJ/RXrMW6//mJs3vMwxi/e4SsrMWsbJogcdvmjQVVJm1lLuOmtmZrmcLMzMLJeThZmZ5XKbRZfK9nryehpm1m5OFl0q2+vJ62mYWbu5GqqLVXs9eT0NM2s3JwszM8vVtmQhaZ2k3ZKeyZT9qaTnJT0l6V5JR6XyYUk/k7Q53b6cOeZMSU9L2ibpS5LUrpjNzKy+dl5Z3A6cX1P2IHBGRPw74F+AGzKPjUfEinS7KlO+BlgFLEu32uc0M7M2a1uyiIhHgNdryr4bEVPp7mPA4mbPIWkQWBARj0alu88dwEVtCNfMzJoos83id4EHMveXSPqRpIclnZPKhoCJzD4TqczMzApUStdZSTcCU8CdqWgncGJEvCbpTOAfJJ0O1GufaDigQNIqKlVWnHjiia0N2sysjxV+ZSHpcuAjwMdT1RIRsTciXkvbm4Bx4BQqVxLZqqrFwI5Gzx0RayNiJCJGBgYG2vVfMDPrO4UmC0nnA9cBH42ItzLlA5LmpO2TqTRkvxARO4E9ks5OvaAuAzYUGbOZmbWxGkrSXcC5wCJJE8BnqfR+OhR4MPWAfSz1fPog8MeSpoB9wFURUW0c/ySVnlWHUWnjyLZzmJlZAdqWLCLi0jrFX22w7z3APQ0eGwXOaGFoZmY2Qx7BbWZmuZwszMwsl5OFmZnlcrIwM7NcXs+ih2UXSAJYtmwZc+f6LTezmfMvRw/LLpD05uR2/ubq32D58uVlh2VmXcjJosdVF0gyMzsYbrMwM7NcThZmZpbLycLMzHI5WZiZWS4nCzMzy+VkYWZmuZwszMwsl8dZ9Ina0dzgEd1mNn3+pegT2dHcgEd0m9mMOFn0kexobs8bZWYz4V+HPuV5o8xsJqbVwC3pA9Mps+5SvdKYNzBUdihm1uGme2Xxl8D7p1FmXchVUmaWp+kvgqRfAH4RGJD06cxDC4A5OceuAz4C7I6IM1LZQuAbwDDwEnBJRPyf9NgNwBXAPuD3I+I7qfxM4HbgMOB+4FMRETP5T1pzrpIyszx51VCHAPOoJJX5mdsbwG/lHHs7cH5N2fXAQxGxDHgo3UfSacBK4PR0zF9LqiajNcAqYFm61T5nIaamptiyZcv+2/j4OL2Us1wlZWbNNL2yiIiHgYcl3R4RL8/kiSPiEUnDNcUXAuem7a8B/wxcl8rvjoi9wIuStgFnSXoJWBARjwJIugO4CHhgJrG0wtjYGFfeet/+H9PdW59g/knLObLoQMzMSjDdiulDJa2lUn20/5iI+JUZvt5xEbEzHbtT0rGpfAh4LLPfRCr717RdW16XpFVUrkI48cQTZxhavnkDQ/u7nr45ub3lz29m1qmmmyy+CXwZ+AqVNoVWU52yaFJeV0SsBdYCjIyM9E4dkZlZyaabLKYiYk0LXm+XpMF0VTEI7E7lE8AJmf0WAztS+eI65WZmVqDpTiT4j5L+m6RBSQurt1m83kbg8rR9ObAhU75S0qGSllBpyH48VVntkXS2JAGXZY4xM7OCTPfKovoD/weZsgBObnSApLuoNGYvkjQBfBb4HLBe0hXAK8BvA0TEs5LWA88BU8DVEVGt7vok73SdfYASGrf7icdcmFk90/oViIglM33iiLi0wUPnNdh/NbC6TvkocMZMX99mx2MuzKyeaSULSZfVK4+IO1objnWC7ISDZmYw/Wqon89sv5fK1cETgJOFmVkfmG411H/P3pd0JPC3bYnIzMw6zmyXVX2LSo8lMzPrA9Nts/hH3hkMNwdYDqxvV1BmZtZZpttm8WeZ7Sng5YiYaLSz9Z6pqSnGxsYOKHO3WrP+Md02i4clHcc7Dd1jzfa33lM7kaK71Zr1l+mulHcJ8DiVQXSXAD+QlDdFufWY6kSKnsrcrP9Mtw7hRuDnI2I3gKQB4HvA37crMCtfdjR3r63fYWYzM91k8Z5qokheY/Y9qaxLZEdze/0Os/423WTxbUnfAe5K93+HyhKn1uOqo7m9fodZf8tbg/vfUlmw6A8kfQz4JSprTDwK3FlAfGZm1gHyqpL+HNgDEBHfiohPR8T/oHJV8eftDc3MzDpFXrIYjoinagvTTLDDbYnIzMw6Tl6yeG+Txw5rZSBmZta58hq4fyjp9yLitmxhWrxoU/vCsk7nRZLM+kvet/ta4F5JH+ed5DACHAJc3Ma4rMN5kSSz/tI0WUTELuAXJf0y76xWd19E/K+2R2Ydz4skmfWP6c4N9X3g+22OxczMOpRHYZuZWa7Ck4WkUyVtztzekHStpJskbc+UX5A55gZJ2yRtlfShomM2M+t3hXdfiYitwAoASXOA7cC9wH8BvhgR2bUzkHQasBI4HTge+J6kUyJiX5Fxm5n1s7Kroc4DxiPi5Sb7XAjcHRF7I+JFYBtwViHRmZkZUH6yWMk7kxMCXCPpKUnrJB2dyoaAVzP7TKSyd5G0StKopNHJycn2RGxNTU1NsWXLlv23qampskMysxYoLVlIOgT4KPDNVLQGWEqlimoncEt11zqH111YISLWRsRIRIwMDAy0NmCbluqKep9Zv5krb73vXUuxmll3KnPI7YeBJ9JYjuqYDgAk3Qb8U7o7AZyQOW4xsKOoIC1f7SJJRyw63uMvzHpMmcniUjJVUJIGI2Jnunsx8Eza3gh8XdIXqDRwL6OyxKt1CC+SZNb7SkkWkg4Hfg24MlP8J5JWUKlieqn6WEQ8K2k98BwwBVztnlCdx4skmfW2UpJFRLwFHFNT9okm+68GVrc7LjMzq6/s3lBmZtYFnCzMzCyXk4WZmeVysjAzs1xOFmZmlsvrYFohpqam3jWa20uxmnUPf1OtENVpQOYNVKb18lKsZt3FyaKJ7F/D4+PjRNSdksoa8DQgZr3DyaKJ7F/DnsZi5jwNiFnvcAN3jnkDQywYHObwhceVHUpXqk4D4vNn1t2cLMzMLJeThZmZ5XKyMDOzXE4WZmaWy72hrBTZbrXgAXpmnc7fTitFtlvtnl2vcN0Fp7N06VLAicOsE/kbaaXJrq5384YnOWbxHo/sNutQThbWEaqJw8w6kxu4zcwsl5OFmZnlKiVZSHpJ0tOSNksaTWULJT0oaSz9e3Rm/xskbZO0VdKHyojZzKyfldlm8csR8ZPM/euBhyLic5KuT/evk3QasBI4HTge+J6kUyJiX/EhW7u5S61ZZ+qkb+GFwLlp+2vAPwPXpfK7I2Iv8KKkbcBZwKMlxGht5i61Zp2prDaLAL4raZOkVansuIjYCZD+PTaVDwGvZo6dSGXvImmVpFFJo5OTk20K3dqt2jNK75nDzRue5DPrN3Plrfe9a6U9MytOWX+mfSAidkg6FnhQ0vNN9lWdsrqrEEXEWmAtwMjIiFcq6gHuUmvWGUq5soiIHenf3cC9VKqVdkkaBEj/7k67TwAnZA5fDOwoLlozMys8WUg6QtL86jbw68AzwEbg8rTb5cCGtL0RWCnpUElLgGXA48VGbWbW38qohjoOuFdS9fW/HhHflvRDYL2kK4BXgN8GiIhnJa0HngOmgKvdE8rMrFiFJ4uIeAH493XKXwPOa3DMamB1m0MzM7MGPILbzMxyOVmYmVkuj3CyrlA7shs8SM+sSP6mWVfIjuwGDlj3Ympq6oABe04iZq3nb5R1jUYD9MbGxrjy1vuYNzDkKULM2sTfIutK2Wqp8fFxjlh0vFfdM2sjJwvrStlqqd1bn2D+Scs5Mj3mKULMWs+9oaxrVZPC4QuPKzsUs57nZGFmZrmcLMzMLJfbLKxnedU9s9bxN8d6VrYR3D2jzA6Ok4X1tGojuEeAmx0cf1OsLzQbAW5m+ZwsrG94/IXZ7DlZWF9y47fZzPjbYX3Jjd9mM+NkYX2rXuP31NQUAHPnzj1gG3z1Yf3Nn3zre7XzTM054iiOWXzyAdu++rB+52RhxjtXGW9ObmfO/IXv2jbrd4UnC0knAHcA/wZ4G1gbEX8h6Sbg94DJtOsfRsT96ZgbgCuAfcDvR8R3io7b+ttsxml4USbrJWV8cqeAz0TEE5LmA5skPZge+2JE/Fl2Z0mnASuB04Hjge9JOiUi9hUatfW12YzTyC7K5Gos63aFJ4uI2AnsTNt7JG0BhpocciFwd0TsBV6UtA04C3i07cGaZWTHaUy36+28gSFXY1lPKHXWWUnDwPuAH6SiayQ9JWmdpKNT2RDwauawCRokF0mrJI1KGp2cnKy3i1lLVK40nuQz6zdz5a33HVDdZNaLSksWkuYB9wDXRsQbwBpgKbCCypXHLdVd6xwe9Z4zItZGxEhEjAwMDLQ+aLOM6pXGvIFmF8ZmvaGU1jZJP0clUdwZEd8CiIhdmcdvA/4p3Z0ATsgcvhjYUVCoZrk8Gtz6QRm9oQR8FdgSEV/IlA+m9gyAi4Fn0vZG4OuSvkClgXsZ8HiBIZs15dHg1g/K+PPnA8AngKclbU5lfwhcKmkFlSqml4ArASLiWUnrgeeo9KS62j2hrNN4kkLrdWX0hvrf1G+HuL/JMauB1W0LyqxFslVS4+PjRNRtXjPrOq5YNWuh2qlD5p+0nCOZ3aA+s07iT6pZi2WnDqny4kvW7ZwsamSnaHA1grWS2zWsmzlZ1MhO0ZCtRjBrF88hZd3An8g6qlM0ZKsRzNqllXNIOfFYu/hTZFaC2l5TRyw6ftZVVLVVp59/YAvzj13sdhFrKScLsxJMp9dU7Up9jVbuq1d16rYRazUnC7OS5PWayq7UBzRduc9Vp9ZuThZmHabeqn2AV+6zUjlZmHUhjxS3ojlZmHWhRm0e3c69uTqX3wWzLlWvzaPbeSnazuVkYWYdxUvRdqZSl1U1M7Pu4CsLsz7h9gA7GP6kmPWJbHvAnl2vcN0Fp7N06dIDBvs1GvgHjZONk1B/8Dtq1qNq19DITivy5uR2bt7w5LsG/2W3swmleny9qUQaNUrXJhFwIulmftfMelTtGhq1XWzrDf6r3a4mlOzxCwaHG85tVVteTS5Aw0SSvZrxmJHO5WRh1sOya2jMpotto+MbjfOoV149vlEiqb2y6ZUxI73GycLMZqXROI9G5Y0SSe3VjHWmrkkWks4H/gKYA3wlIj5XckhmNkMzGUjodcs7S1ecdUlzgFuBXwMmgB9K2hgRz5UbmZm1S22by3R6cE23N9d0jp/uFPHTed6ijq/9P7dSVyQL4CxgW0S8ACDpbuBCoC3JovpXz1uv72LO3r288d7DDthu9th0tn28j++G1+yI4484av/38mc//QnXr/s2Rx07xOuvPM973ju/6fZbP93NH/+n8w7ozfVHf/cQhx917LSOz24DDZ97Os9b1PFv/XQ3d352VVumSFE39DyQ9FvA+RHxX9P9TwD/ISKuqdlvFbAq3T0V2DqDl1kE/KQF4bZaJ8bViTGB45qJTowJOjOuTowJ2hfXSRExUFvYLVcWqlP2riwXEWuBtbN6AWk0IkZmc2w7dWJcnRgTOK6Z6MSYoDPj6sSYoPi4umVuqAnghMz9xcCOkmIxM+s73ZIsfggsk7RE0iHASmBjyTGZmfWNrqiGiogpSdcA36HSdXZdRDzb4peZVfVVAToxrk6MCRzXTHRiTNCZcXViTFBwXF3RwG1mZuXqlmooMzMrkZOFmZnlcrKgMpWIpK2Stkm6vsDXPUHS9yVtkfSspE+l8pskbZe0Od0uyBxzQ4pzq6QPtTG2lyQ9nV5/NJUtlPSgpLH079FFxSXp1Mz52CzpDUnXlnGuJK2TtFvSM5myGZ8bSWemc7xN0pck1esifjAx/amk5yU9JeleSUel8mFJP8ucsy+3I6Ymcc34PSsorm9kYnpJ0uZUXsj5avJ7UOpna7+I6OsblQbzceBk4BDgSeC0gl57EHh/2p4P/AtwGnAT8D/r7H9aiu9QYEmKe06bYnsJWFRT9ifA9Wn7euDzRceVec9+DJxUxrkCPgi8H3jmYM4N8DjwC1TGET0AfLjFMf06MDdtfz4T03B2v5rnaVlMTeKa8XtWRFw1j98C/FGR54vGvwelfraqN19ZZKYSiYj/B1SnEmm7iNgZEU+k7T3AFmCoySEXAndHxN6IeBHYRiX+olwIfC1tfw24qKS4zgPGI+LlJvu0LaaIeAR4vc7rTfvcSBoEFkTEo1H5dt+ROaYlMUXEdyNiKt19jMr4pIZaHVOjuJoo5FzlxZX+Cr8EuKvZc7ThPWz0e1DqZ6vKyaLyZryauT9B8x/stpA0DLwP+EEquiZVH6zLXHYWGWsA35W0SZVpVACOi4idUPlgA8eWEBdUxtlkv8hlnyuY+bkZSttFxfe7VP7CrFoi6UeSHpZ0TibWomKayXtW9Lk6B9gVEdll/go9XzW/Bx3x2XKymOZUIm0NQJoH3ANcGxFvAGuApcAKYCeVS2IoNtYPRMT7gQ8DV0v6YJN9C4tLlUGZHwW+mYo64Vw10yiOIs/ZjcAUcGcq2gmcGBHvAz4NfF3SggJjmul7VvR7eSkH/jFS6Pmq83vQcNcGr9+WuJwsSp5KRNLPUflg3BkR3wKIiF0RsS8i3gZu453qk8JijYgd6d/dwL0phl3pErd6Cb676LioJK8nImJXiq/0c5XM9NxMcGC1UFvik3Q58BHg46lKglRt8Vra3kSlrvuUomKaxXtWSFwAkuYCHwO+kYm3sPNV7/eADvlsOVmUOJVIqhv9KrAlIr6QKR/M7HYxUO2xsRFYKelQSUuAZVQaslod1xGS5le3qTSUPpNe//K02+XAhiLjSg74q6/sc5Uxo3OTqhP2SDo7fQ4uyxzTEqosGHYd8NGIeCtTPqDKGjFIOjnF9EIRMaXXnNF7VlRcya8Cz0fE/mqcos5Xo98DOuWzdbAt5L1wAy6g0vNgHLixwNf9JSqXh08Bm9PtAuBvgadT+UZgMHPMjSnOrbSgh0ODuE6m0sviSeDZ6jkBjgEeAsbSvwsLjutw4DXgyExZ4eeKSrLaCfwrlb/irpjNuQFGqPxQjgN/RZpRoYUxbaNSp139bH057fub6X19EngC+I/tiKlJXDN+z4qIK5XfDlxVs28h54vGvwelfraqN0/3YWZmuVwNZWZmuZwszMwsl5OFmZnlcrIwM7NcThZmZpbLycLMzHI5WZiZWa7/DxkKSZdWMuCxAAAAAElFTkSuQmCC",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import seaborn as sns\n",
    "\n",
    "\n",
    "sns.histplot(board_bbox_l_stat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "54\n",
      "42068\n"
     ]
    }
   ],
   "source": [
    "x = np.array(board_bbox_l_stat)\n",
    "print(np.sum(x < 32))\n",
    "print(len(board_bbox_l_stat))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:ylabel='Count'>"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAD4CAYAAADGmmByAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjQuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8rg+JYAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZMklEQVR4nO3df5Rc9X3e8fezO7trbCwbwsJRJBqJRnUDnNPYrInAP04bxUVxUwuntpEbozXGyCUiteM2LdTn1P5Hp7h1XQdzkJERRnIwWHHsg9wWx0Qm2EkwYsHYIGQVYWxQUKSNUyOlRjO7O5/+Md9ZLqvZ1a7uzr0zu8/rnDlz53vvnfuZu6N5dH99ryICMzOzU9VTdgFmZtbdHCRmZpaLg8TMzHJxkJiZWS4OEjMzy6VSdgFFO+uss2LFihVll2Fm1lUeeeSRv42IwVbjFl2QrFixgpGRkbLLMDPrKpJ+Mt0479oyM7NcHCRmZpaLg8TMzHJxkJiZWS4OEjMzy8VBYmZmuThIzMwsFweJmZnl4iA5BRFBtVrF93IxM3OQnJJarcYVN++mVquVXYqZWenaFiSSbpd0RNITmbYzJd0n6an0fEZm3A2SDkjaL+myTPtFkh5P426SpNQ+IOnLqf0hSSva9Vla6a30F7k4M7OO1c4tkjuAtVPargd2R8QqYHd6jaTzgfXABWmeWyT1pnm2ABuBVenRfM+rgf8bEb8M/A/gk237JGZmNq22BUlEfBv4uynN64DtaXg7cHmm/e6IqEbEM8AB4GJJS4ElEfFgNA5I7JgyT/O9vgKsaW6tmJlZcYo+RnJORBwCSM9np/ZlwHOZ6Q6mtmVpeGr7y+aJiHHgBeAXWi1U0kZJI5JGRkdH5+mjmJkZdM7B9lZbEjFD+0zznNgYsTUihiJiaHCwZXf6ZmZ2iooOksNpdxXp+UhqPwicm5luOfB8al/eov1l80iqAK/hxF1pZmbWZkUHyS5gOA0PA/dk2tenM7FW0jiovift/jomaXU6/rFhyjzN93oX8K3whR1mZoVr2x0SJd0F/FPgLEkHgY8DNwI7JV0NPAu8GyAi9kraCTwJjAObImIivdW1NM4AOw24Nz0AtgFflHSAxpbI+nZ9lqxqtUq1Wi1iUWZmXaFtQRIR751m1Jpppt8MbG7RPgJc2KL9OCmIzMysPJ1ysN3MzLqUg8TMzHJxkJiZWS4OEjMzy8VBYmZmuThIzMwsFweJmZnl4iAxM7NcHCRmZpaLg8TMzHJxkJiZWS4OEjMzy8VBYmZmuThIzMwsFweJmZnl4iAxM7NcHCRmZpaLg8TMzHJxkJiZWS4OEjMzy8VBkkO1WqVarZZdhplZqRwkZmaWi4PEzMxycZCYmVkuDhIzM8vFQWJmZrk4SMzMLBcHiZmZ5eIgMTOzXBwkZmaWi4PEzMxyKSVIJP2+pL2SnpB0l6RXSDpT0n2SnkrPZ2Smv0HSAUn7JV2Wab9I0uNp3E2SVMbnMTNbzAoPEknLgH8LDEXEhUAvsB64HtgdEauA3ek1ks5P4y8A1gK3SOpNb7cF2AisSo+1BX4UMzOjvF1bFeA0SRXglcDzwDpgexq/Hbg8Da8D7o6IakQ8AxwALpa0FFgSEQ9GRAA7MvOYmVlBCg+SiPhr4FPAs8Ah4IWI+CZwTkQcStMcAs5OsywDnsu8xcHUtiwNT203M7MClbFr6wwaWxkrgV8EXiXpfTPN0qItZmhvtcyNkkYkjYyOjs61ZDMzm0EZu7Z+A3gmIkYjYgz4KnApcDjtriI9H0nTHwTOzcy/nMausINpeGr7CSJia0QMRcTQ4ODgvH4YM7PFrowgeRZYLemV6SyrNcA+YBcwnKYZBu5Jw7uA9ZIGJK2kcVB9T9r9dUzS6vQ+GzLztE1EcPToUer1ersXZWbWFSpFLzAiHpL0FeBRYBz4HrAVOB3YKelqGmHz7jT9Xkk7gSfT9JsiYiK93bXAHcBpwL3p0Va1Wo1rvvAgA69c0u5FmZl1hcKDBCAiPg58fEpzlcbWSavpNwObW7SPABfOe4En0VPpK3qRZmYdy1e2m5lZLg4SMzPLxUFiZma5OEjMzCwXB4mZmeXiIDEzs1wcJGZmlouD5BRNjNeoVqtll2FmVjoHiZmZ5eIgMTOzXBwkZmaWi4PEzMxycZCYmVkuDpJTFBFUq1Uat4s3M1u8HCSnqD4xzoe276FWq5VdiplZqRwkOfi+JGZmDhIzM8vJQWJmZrk4SOZBtVp1dylmtmg5SMzMLBcHiZmZ5eIgMTOzXBwkZmaWi4PEzMxycZCYmVkuDhIzM8vFQWJmZrk4SMzMLBcHiZmZ5eIgMTOzXBwkOfjmVmZmDpJc6hPjXH37d31zKzNb1EoJEkmvlfQVST+UtE/SJZLOlHSfpKfS8xmZ6W+QdEDSfkmXZdovkvR4GneTJBX9WXoq/UUv0syso5S1RfKHwDci4h8D/wTYB1wP7I6IVcDu9BpJ5wPrgQuAtcAtknrT+2wBNgKr0mNtkR/CzMxKCBJJS4C3AtsAIqIWET8D1gHb02TbgcvT8Drg7oioRsQzwAHgYklLgSUR8WA0DlLsyMxjZmYFKWOL5DxgFPiCpO9Juk3Sq4BzIuIQQHo+O02/DHguM//B1LYsDU9tP4GkjZJGJI2Mjo7O76cxM1vkZhUkkt40m7ZZqgBvALZExOuB/0fajTXd4lu0xQztJzZGbI2IoYgYGhwcnGu92fdJd0L0WVpmZk2z3SL57CzbZuMgcDAiHkqvv0IjWA6n3VWk5yOZ6c/NzL8ceD61L2/R3ja1Wo3hrQ8Q9ZeCZGK85tvsmtmiVplppKRLgEuBQUkfzYxaAvS2nmtmEfE3kp6T9LqI2A+sAZ5Mj2HgxvR8T5plF/AlSZ8GfpHGQfU9ETEh6Zik1cBDwAZOPdxmrbfSx8T4RLsXY2bWNWYMEqAfOD1N9+pM+1HgXTmW+3vAnZL6gR8BV9HYOtop6WrgWeDdABGxV9JOGkEzDmyKiOYv+bXAHcBpwL3pYWZmBZoxSCLiAeABSXdExE/ma6ER8Rgw1GLUmmmm3wxsbtE+Alw4X3WZmdncnWyLpGlA0lZgRXaeiPj1dhRlZmbdY7ZB8sfA54DbAB8gMDOzSbMNkvGI2NLWSszMrCvN9vTfr0v6XUlLU59YZ0o6s62VdRn3BGxmi9Vsg2QY+APgr4BH0mOkXUV1k2aAVKtVrrh5t3sCNrNFZ1ZBEhErWzzOa3dx3aA+Mc6Htu+hVqvR656AzWwRmtUxEkkbWrVHxI75Lac79VT6yi7BzKw0sz3Y/sbM8CtoXO/xKI0ed83MbBGbVZBExO9lX0t6DfDFtlRkZmZd5VS7kf85jT6vzMxskZvtMZKv81Lf6b3ArwA721WUmZl1j9keI/lUZngc+ElEHJxuYjMzWzxme/rvA8APafQAfAbgiyXMzAyY/R0S3wPsodG1+3uAhyTl6UbezMwWiNnu2voY8MaIOAIgaRD4Mxp3NzQzs0Vstmdt9TRDJPnpHOY1M7MFbLZbJN+Q9KfAXen1FcD/bk9JZmbWTU52z/ZfBs6JiD+Q9NvAmwEBDwJ3FlCfmZl1uJPtnvoMcAwgIr4aER+NiN+nsTXymfaW1j2aPQCbmS1GJwuSFRHxg6mN6V7pK9pSURdq9gBcr/teJGa2+JwsSF4xw7jT5rOQbucegM1ssTpZkDws6ZqpjZKupnFzKzMzW+ROdtbWR4CvSfodXgqOIaAfeGcb6zIzsy4xY5BExGHgUkn/DLgwNf+viPhW2yszM7OuMNv7kdwP3N/mWhaE5tlbAwMDJVdiZlYMX51uZma5OEjMzCwXB4mZmeXiIJknE+NjRL1edhlmZoVzkJiZWS4OEjMzy8VBYmZmuZQWJJJ6JX1P0v9Mr8+UdJ+kp9LzGZlpb5B0QNJ+SZdl2i+S9Hgad5MklfFZzMwWszK3SD4M7Mu8vh7YHRGrgN3pNZLOB9YDFwBrgVsk9aZ5tgAbgVXpsbaY0s3MrKmUIJG0HPgXwG2Z5nXA9jS8Hbg80353RFQj4hngAHCxpKXAkoh4MCIC2JGZx8zMClLWFslngP8AZM+XPSciDgGk57NT+zLgucx0B1PbsjQ8tf0EkjZKGpE0Mjo6Oi8fYCbNG1018s3MbGErPEgk/RZwJCJm2w19q+MeMUP7iY0RWyNiKCKGBgcHZ7nYuZsYr1GtVqnValxx825qtVrblmVm1ilm1WnjPHsT8A5Jb6dx46wlkv4IOCxpaUQcSrutjqTpDwLnZuZfDjyf2pe3aO8IvZX+skswMytE4VskEXFDRCyPiBU0DqJ/KyLeB+wChtNkw8A9aXgXsF7SgKSVNA6q70m7v45JWp3O1tqQmacU3qVlZotRJ11HciPwNklPAW9Lr4mIvcBO4EngG8CmiJhI81xL44D9AeBp4N6ii85q3rvdu7TMbDEpY9fWpIj4c+DP0/BPgTXTTLcZ2NyifYSXbrjVEXzvdjNbbDppi8TMzLqQg2QOqtUq9bqPf5iZZTlIzMwsFweJmZnl4iAxM7NcHCRmZpaLg8TMzHJxkLRZtVqlWq2WXYaZWds4SOZZs5sUM7PFwkEyz5rdpPh6EzNbLBwkbRBA1Osnnc7MbCFwkJiZWS4OEjMzy8VBYmZmuThI2sQ3uTKzxcJB0ib1iTGuvPU71Go1h4qZLWgOkjbqSfdtr9VqXHHzbt850cwWJAdJG02M1yYvTuxNoWJmttA4SMzMLBcHiZmZ5eIgMTOzXBwkbeSztcxsMXCQtFGzA0efrWVmC5mDpM16Kn1ll2Bm1lYOEjMzy8VBYmZmuThICuTb7prZQuQgaTPfetfMFjoHSZv51rtmttA5SArgW++a2ULmICmYL1I0s4Wm8CCRdK6k+yXtk7RX0odT+5mS7pP0VHo+IzPPDZIOSNov6bJM+0WSHk/jbpKkoj/PXLlLeTNbaMrYIhkH/l1E/AqwGtgk6XzgemB3RKwCdqfXpHHrgQuAtcAtknrTe20BNgKr0mNtkR/kVLlLeTNbSAoPkog4FBGPpuFjwD5gGbAO2J4m2w5cnobXAXdHRDUingEOABdLWgosiYgHo7GfaEdmno6TvTeJmdlCUuoxEkkrgNcDDwHnRMQhaIQNcHaabBnwXGa2g6ltWRqe2t5qORsljUgaGR0dndfPMFs+NmJmC1VpQSLpdOBPgI9ExNGZJm3RFjO0n9gYsTUihiJiaHBwcO7FzgN34GhmC1UpQSKpj0aI3BkRX03Nh9PuKtLzkdR+EDg3M/ty4PnUvrxFe8fKduDoLRQzWyjKOGtLwDZgX0R8OjNqFzCchoeBezLt6yUNSFpJ46D6nrT765ik1ek9N2Tm6Xg+e8vMFopKCct8E3Al8Likx1LbfwJuBHZKuhp4Fng3QETslbQTeJLGGV+bImIizXctcAdwGnBvenSsqd2l+OwtM1sICg+SiPgLWh/fAFgzzTybgc0t2keAC+evuvaqT4zzwW1/yStOf+1kWzNYBgYGSqrKzCwfX9leMN/oyswWGgeJmZnl4iApgbuWN7OFxEFSgrHqz3n/1u+4a3kzWxAcJCXxNSVmtlA4SEoyMT42eY8SX1NiZt3MQdIhfE2JmXUrB0mJxseqHD161Lu0zKyrOUhK5I4czWwhcJCUzAfdzazbOUhKlr2mxAfdzawbOUhK1ux/a2K80Q+lD7qbWbdxkHQA979lZt2sjG7k7SSax0kk0d/fT+N2K2ZmnclbJB2ieSpwk4+XmFm3cJB0iOapwBMT9cktkt5KP9Vq1R08mllHc5B0kABqL/49w7d+m2PHjpVdjpnZrDhIOpHEh7bveVnvwN4yMbNO5SDpUD2VvhMuUPQFi2bWiRwkHWpifIyJsSpX3faXkwfcfQDezDqRT//tYBPjY/T0Vjh+/PjkVkjzgsWIoFar+fRgMyudt0g6XH1inA1b7udfT9kSaW6dNI+deHeXmZXFQdIFeip9qLfC0aNHqdfrk+HRW+n37i4zK52DpEvUJ8bZ9Ecj1CfqHD9+nOPHj0+O6630+0C8mZXGQdJFeip9jFV/zoYt97Nhy/2Mj4237DnYoWJmRXKQdKGeSt9kqFx5y7cmew7u6e3j6NGjHD161Lu7zKwwDpJZeum+IZ31v/yeSt9kP131iTHe//m/4NixY97dZWaFcZDMUq1WY3jrA0S9836Um/101esB0uT9TWq1Gu/57J8xOjrKz372M44fPz55sN7hYmbzxdeRzEFvpW9yN1Kn6an0MTE+RtTrk1spo6OjRATDtz5AT08vvX39bLvq17hmx8N8+bo19Pf3n3Atiq9PMbO58hbJAtU8y2u8VkM9vY2gmRjn/Vu/A6rwwgsvMDo6yhU37+bYsWO+3a+ZnTJvkSxg2TsvToyPTQ7XXvx7Nmy5H4CBV71mcpfXiy++SK1Wm+y+vnlzrb6+PsbGxryVYmYtOUgWqWbINE8nzqoMnMYLL7zAwMAA1+x4mNuGL+aD2/fwxY1v4fTTT588xbhJEgMDA0Bji8bBY7a4qNsPukpaC/wh0AvcFhE3zjT90NBQjIyMzHk51WqVd33mG7M+RtJb6aNenyDq9Tkv61TM9/LqaQump9JHfXysETwRfPa9v8p1X3qEqMfkMqUebh2++ITg2XHNm+nv7598z+atg5u7zaYLn2abg8isc0h6JCKGWo7r5iCR1Av8H+BtwEHgYeC9EfHkdPM4SPKZDJUpy2wVPPXxsZdN09NbmQwiqZdbhy8GYNNdj7HtqtV8cPsePr/hjQBcs+PhE4KoCFPDrr+/n7GxsZZbWdkTE+ClQMwGpYPQFoqFHCSXAJ+IiMvS6xsAIuK/TDdPniB556e+Tn2RB0neZWaDaLrwybZl9VT6iDZ+xp5KH5X+AT535RDXfOFBAD5/1SV8aPt3uXV4Nb975yNs+8ClDAwMMDAwQLVa5X1b7mfbBy4F4Orb/4ptH7iU9299APVU+NKmX5/c6jLrBHm+jws5SN4FrI2ID6bXVwK/FhHXTZluI7AxvXwdsP8UF3kW8LenOG87ua6569TaOrUu6NzaOrUu6NzaTqWuX4qIwVYjuv1ge6v9BickY0RsBbbmXpg0Ml0il8l1zV2n1tapdUHn1tapdUHn1jbfdXX7dSQHgXMzr5cDz5dUi5nZotTtQfIwsErSSkn9wHpgV8k1mZktKl29aysixiVdB/wpjdN/b4+IvW1cZO7dY23iuuauU2vr1Lqgc2vr1Lqgc2ub17q6+mC7mZmVr9t3bZmZWckcJGZmlouDZBYkrZW0X9IBSdcXvOxzJd0vaZ+kvZI+nNo/IemvJT2WHm/PzHNDqnW/pMvaXN+PJT2eahhJbWdKuk/SU+n5jCJrk/S6zHp5TNJRSR8pa51Jul3SEUlPZNrmvI4kXZTW9QFJNynnZfPT1PXfJP1Q0g8kfU3Sa1P7CkkvZtbd59pV1wy1zfnvV9A6+3Kmph9Leiy1F7bOZvidKOZ7FhF+zPCgcRD/aeA8oB/4PnB+gctfCrwhDb+aRpcw5wOfAP59i+nPTzUOACtT7b1trO/HwFlT2v4rcH0avh74ZBm1Zf5+fwP8UlnrDHgr8AbgiTzrCNgDXELj+ql7gd9sQ13/HKik4U9m6lqRnW7K+8xrXTPUNue/XxHrbMr4/w7856LXGdP/ThTyPfMWycldDByIiB9FRA24G1hX1MIj4lBEPJqGjwH7gGUzzLIOuDsiqhHxDHCAxmco0jpgexreDlxeYm1rgKcj4iczTNPWuiLi28DftVjmrNeRpKXAkoh4MBr/2ndk5pm3uiLimxExnl5+l8a1WdNqR13T1TaDUtdZU/qf+3uAu2Z6jzbVNd3vRCHfMwfJyS0Dnsu8PsjMP+RtI2kF8HrgodR0XdoFcXtmk7XoegP4pqRH1OiKBuCciDgEjS84cHZJtUHj2qLsP+xOWGcw93W0LA0XWeMHaPyPtGmlpO9JekDSW1Jb0XXN5e9XdG1vAQ5HxFOZtsLX2ZTfiUK+Zw6Sk5tVNyxtL0I6HfgT4CMRcRTYAvxD4FeBQzQ2qaH4et8UEW8AfhPYJOmtM0xbaG1qXKT6DuCPU1OnrLOZTFdL0evuY8A4cGdqOgT8g4h4PfBR4EuSlhRc11z/fkX/Xd/Ly//TUvg6a/E7Me2k09RwSrU5SE6u9G5YJPXR+HLcGRFfBYiIwxExERF14PO8tCum0Hoj4vn0fAT4WqrjcNpEbm7GHymjNhrh9mhEHE41dsQ6S+a6jg7y8t1MbatR0jDwW8DvpN0bpF0gP03Dj9DYp/6PiqzrFP5+Ra6zCvDbwJcz9Ra6zlr9TlDQ98xBcnKldsOS9rtuA/ZFxKcz7Uszk70TaJ5FsgtYL2lA0kpgFY2DZ+2o7VWSXt0cpnGg9olUw3CabBi4p+jakpf9D7ET1lnGnNZR2i1xTNLq9J3YkJln3qhxo7j/CLwjIn6eaR9U4/4/SDov1fWjoupKy53T36/I2oDfAH4YEZO7hYpcZ9P9TlDU9yzPmQKL5QG8ncZZEE8DHyt42W+msWn5A+Cx9Hg78EXg8dS+C1iamedjqdb9zMMZNDPUdh6NMz++D+xtrhvgF4DdwFPp+cwSansl8FPgNZm2UtYZjTA7BIzR+B/f1aeyjoAhGj+eTwM3k3qmmOe6DtDYd978rn0uTfuv0t/4+8CjwL9sV10z1Dbnv18R6yy13wH8mynTFrbOmP53opDvmbtIMTOzXLxry8zMcnGQmJlZLg4SMzPLxUFiZma5OEjMzCwXB4mZmeXiIDEzs1z+P8MCdA12RDTAAAAAAElFTkSuQmCC",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "sns.histplot(bbox_l_stat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "53036\n",
      "220722\n"
     ]
    }
   ],
   "source": [
    "x = np.array(bbox_l_stat)\n",
    "print(np.sum(x < 32))\n",
    "print(len(bbox_l_stat))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'image_id': '0016_74957599317_7a8ac4d3dd5d4fe70b5544624de32611',\n",
       " 'point_seq_id': '50b4ced2-b0cf-11eb-a39c-0242ac1e3f02',\n",
       " 'name': '白铁加工',\n",
       " 'texts': ['白铁加工', '13839651979'],\n",
       " 'areas': [0.4783580283815136, 0.08641843517371935],\n",
       " 'min_dist': 0,\n",
       " 'rankorder': [0, -1],\n",
       " 'board_bbox': [429, 344, 816, 569],\n",
       " 'origin_bboxs': [[452, 381, 781, 518], [585, 502, 784, 554]],\n",
       " 'normlize_bboxs': [[0.059431524547803614,\n",
       "   0.09560723514211886,\n",
       "   0.9095607235142119,\n",
       "   0.4496124031007752],\n",
       "  [0.40310077519379844,\n",
       "   0.4082687338501292,\n",
       "   0.917312661498708,\n",
       "   0.5426356589147286]],\n",
       " 'contour': [[[768, 381],\n",
       "   [781, 382],\n",
       "   [781, 478],\n",
       "   [476, 518],\n",
       "   [459, 518],\n",
       "   [452, 515],\n",
       "   [452, 434],\n",
       "   [460, 424],\n",
       "   [465, 419],\n",
       "   [467, 418],\n",
       "   [472, 417],\n",
       "   [489, 414],\n",
       "   [495, 413],\n",
       "   [526, 408],\n",
       "   [558, 403],\n",
       "   [591, 398],\n",
       "   [632, 393],\n",
       "   [763, 381]],\n",
       "  [[784, 502],\n",
       "   [784, 531],\n",
       "   [780, 532],\n",
       "   [775, 533],\n",
       "   [769, 534],\n",
       "   [687, 544],\n",
       "   [654, 548],\n",
       "   [603, 554],\n",
       "   [591, 554],\n",
       "   [590, 552],\n",
       "   [585, 537],\n",
       "   [585, 529],\n",
       "   [594, 525],\n",
       "   [771, 502]]],\n",
       " 'board_contour': [[816, 344], [429, 403], [429, 543], [569, 569], [816, 540]]}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_train_data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "folder = \"./preprocess_data\"\n",
    "if not os.path.exists(folder):\n",
    "    os.makedirs(folder)\n",
    "with open(os.path.join(folder, \"train.json\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(json.dumps(new_train_data, ensure_ascii=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"./data/public_TestB.json\", \"r\", encoding=\"utf-8\") as f:\n",
    "    test_data = json.load(f)[\"data\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'dict'>\n"
     ]
    }
   ],
   "source": [
    "print(type(test_data))\n",
    "test_keys = list(test_data.keys())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'point_seq_id': '6a92e0b8-8043-4ad5-91b0-f8110f3b19c1_758326cd8eb0c24bdc70906a08128f20', 'image_id': '758326cd8eb0c24bdc70906a08128f20', 'board_contour': [[757, 701], [215, 719], [208, 491], [289, 477], [749, 544]], 'texts': [{'text': '兰霞制衣', 'contour': [[728, 640], [699, 640], [335, 618], [325, 617], [322, 616], [314, 613], [314, 563], [319, 522], [322, 519], [324, 518], [341, 518], [700, 559], [708, 560], [713, 561], [717, 562], [721, 564], [728, 568]]}, {'text': '修改各种服装', 'contour': [[758, 700], [753, 701], [743, 701], [553, 695], [526, 694], [523, 693], [521, 692], [520, 691], [519, 663], [519, 647], [521, 645], [538, 645], [607, 648], [671, 651], [748, 655], [752, 656], [755, 659], [758, 687]]}]}\n"
     ]
    }
   ],
   "source": [
    "key = random.choice(test_keys)\n",
    "print(test_data[key])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 处理测试数据\n",
    "# 主要涉及重复字符去除、边框归一化\n",
    "def test_data_preprocess(msg: dict) -> dict:\n",
    "    # 计算每个多边形面积并归一化\n",
    "    total_areas = get_area(msg[\"board_contour\"])\n",
    "    areas = [get_area(x[\"contour\"]) for x in msg[\"texts\"]]\n",
    "    areas = [x / total_areas for x in areas]\n",
    "    # 获取text信息\n",
    "    texts = [x[\"text\"] for x in msg[\"texts\"]]\n",
    "\n",
    "    # 获取每个OCR的bbox并归一化\n",
    "    board_top_left, board_bottom_right, origin_bboxs, normlize_bboxs = normlize_contour(msg)\n",
    "\n",
    "    return {\n",
    "        \"image_id\": msg[\"image_id\"],\n",
    "        \"point_seq_id\": msg[\"point_seq_id\"],\n",
    "        \"texts\": texts,\n",
    "        \"areas\": areas,\n",
    "        \"board_bbox\": [board_top_left[0], board_top_left[1], board_bottom_right[0], board_bottom_right[1]],\n",
    "        \"origin_bboxs\": origin_bboxs,\n",
    "        \"normlize_bboxs\": normlize_bboxs,\n",
    "        \"contour\": [x[\"contour\"] for x in msg[\"texts\"]],\n",
    "        \"board_contour\": msg[\"board_contour\"]\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 11407/11407 [00:03<00:00, 3531.91it/s]\n"
     ]
    }
   ],
   "source": [
    "new_test_data = [test_data_preprocess(test_data[k]) for k in tqdm(test_data)]\n",
    "with open(os.path.join(folder, \"test.json\"), \"w\", encoding=\"utf-8\") as f:\n",
    "    f.write(json.dumps(new_test_data, ensure_ascii=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "c292224ab756d3f2beb5050e3fd1c533f7cdb96e405ecde17472018763e8dd24"
  },
  "kernelspec": {
   "display_name": "Python 3.9.6 64-bit ('py39': conda)",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
